BONUS: perform the gender-based study also on the Master students, as explained in 1. Use scatterplots to visually identify changes over time. Plot males and females with different colors -- can you spot different trends that match the results of your statistical tests?
In [ ]:
# Use the hypothetical data set generated in Question 2.
# We were not able to generate the DataSet and therefore were not able to test this code.
# This is our attempt at demonstrating the algorithm that we would use to compute the statistics for part 3.
In [ ]:
#Statistical Tests for Question 3 (Bonus)
#Find the students who completed the Master's
#Assumption: a student has completed the Master's if they have an entry for both Semester 1 and the Master's Project
#Use an if statement to find the students who possess both
In [ ]:
# Calculate the time each student spent at EPFL and store it in a new
# 'TimeSpent' column of DataSet.
# Formula: time at EPFL = EndDate - StartDate, i.e. (time of Master's
# Project) - (time of Semester 1), so that the duration is positive.
# Assumes DataSet already holds a StartDate and an EndDate column for each
# student -- TODO confirm once the data set from Question 2 exists.
m, n = DataSet.shape  # number of rows / columns, kept for later cells
# Whole-column arithmetic creates the column in one step and covers every
# row, with no explicit index loop.
DataSet['TimeSpent'] = DataSet['EndDate'] - DataSet['StartDate']
In [ ]:
# Calculate the overall average time spent at EPFL, by gender.
# Gender is read from the 'Civilité' column ('Madame' / 'Monsieur'), the same
# column the t-test and per-year cells use.
# NOTE(review): assumes 'TimeSpent' has already been added to DataSet.
is_female = DataSet['Civilité'] == 'Madame'
is_male = DataSet['Civilité'] == 'Monsieur'

# Number of female / male entries.
Fem = int(is_female.sum())
Male = int(is_male.sum())

# Total time spent at EPFL per gender.
TotFem = DataSet.loc[is_female, 'TimeSpent'].sum()
TotMale = DataSet.loc[is_male, 'TimeSpent'].sum()

# Series.mean() yields NaN for an empty group instead of dividing by zero,
# and never falls back to integer division.
MaleAvg = DataSet.loc[is_male, 'TimeSpent'].mean()
FemAvg = DataSet.loc[is_female, 'TimeSpent'].mean()

# Order is [male average, female average].
Avgs = [MaleAvg, FemAvg]
print(Avgs)
In [ ]:
# Determine whether the female / male difference in average time spent at
# EPFL is statistically significant, using an independent two-sample t-test.
from scipy.stats import ttest_ind

Fem = DataSet[DataSet['Civilité'] == 'Madame']
Male = DataSet[DataSet['Civilité'] == 'Monsieur']

# ttest_ind returns (statistic, pvalue) in that order; only the p-value is
# compared against the significance threshold. Missing durations are dropped
# so that a single NaN does not turn the whole result into NaN.
t_stat, p = ttest_ind(Fem['TimeSpent'].dropna(), Male['TimeSpent'].dropna())

if p <= 0.05:
    print("The difference in averages is statistically significant at the 5% significance level")
else:
    print("The difference in averages is not statistically significant at the 5% significance level")
In [ ]:
# Find the evolution of the averages by gender: compute the average stay at
# EPFL for the students arriving in each starting year from 2007 to 2014
# (both included).
# NOTE(review): assumes 'StartDate' holds the starting year as a number that
# can be compared with 2007..2014 -- confirm against the real data set.
FemArray = []   # average stay of female students, one entry per starting year
MaleArray = []  # average stay of male students, one entry per starting year
for year in range(2007, 2015):
    arrivals = DataSet[DataSet['StartDate'] == year]
    Fem2 = arrivals[arrivals['Civilité'] == 'Madame']
    Male2 = arrivals[arrivals['Civilité'] == 'Monsieur']
    # Series.mean() gives NaN for a year with no students of that gender,
    # which keeps both lists aligned with the list of years.
    FemArray.append(Fem2['TimeSpent'].mean())
    MaleArray.append(Male2['TimeSpent'].mean())
In [ ]:
# Create a scatter plot of the evolution of the average stay, with a
# different color for each gender.
import matplotlib.pyplot as plt

# x axis: the starting years 2007..2014 (both included), one per entry of
# FemArray / MaleArray.
x = list(range(2007, 2015))

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(x, FemArray, c='r', label='Female')
ax1.scatter(x, MaleArray, c='b', label='Male')
ax1.set_xlabel('Starting year')
ax1.set_ylabel('Average time spent at EPFL')
plt.legend(loc='upper left')
plt.show()